import time

from bs4 import BeautifulSoup
import requests
from openpyxl import load_workbook
from pandas import DataFrame


# HTTP headers passed to every requests.get() call in get_company_message().
# NOTE(review): the cookie is a hard-coded session id; presumably it has to be
# replaced with a fresh value once that session expires — confirm.
headers = {
        # Browser-like User-Agent string (desktop Chrome 88 on Windows 10).
        'user-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
        'cookie': 'QCCSESSID=k0jt5oqanht3dbmh8ek3a0meu2'
        }

def get_company_message(company):
    """Search qcc.com for *company* and return its basic-info panel text.

    The first search hit (first ``<a class="title">`` element) is followed
    and the text of the ``#Cominfo`` element on that page is returned.

    Args:
        company: Company name used as the search keyword.

    Returns:
        The text of the ``#Cominfo`` element, or ``None`` when the search
        page has no usable result link or the detail page has no
        ``#Cominfo`` element.

    Raises:
        requests.RequestException: On connection errors or timeouts of either
            request, and (as ``HTTPError``) when the detail page answers with
            an error status.
    """
    # Let requests build the query string so names containing '&', '#',
    # spaces, etc. are escaped; bound the wait like the detail request does.
    req = requests.get(
        'https://www.qcc.com/search',
        params={'key': company},
        headers=headers,
        timeout=10,
    )
    req.encoding = 'utf-8'
    time.sleep(2)  # pause between consecutive requests
    soup = BeautifulSoup(req.text, features="html.parser")

    first_hit = soup.find('a', {'class': 'title'})
    if first_hit is None:
        return None
    href = first_hit.get('href')
    if not href:
        # A result link without a target cannot be followed.
        return None
    print(href)

    # Fetch the detail page of the first hit.
    details = requests.get(href, headers=headers, timeout=10)
    details.raise_for_status()
    details.encoding = 'utf-8'
    print(details.text)
    details_soup = BeautifulSoup(details.text, features="html.parser")

    # select_one() yields None when the panel is absent, so no exception
    # handling is needed for the "not found" case.
    info_panel = details_soup.select_one('#Cominfo')
    message = info_panel.text if info_panel is not None else None
    time.sleep(2)  # pause before the caller issues the next search
    return message

# Read the input spreadsheet
def get_companys(sheet_name, filename=r"济宁.xlsx"):
    """Return the company names listed in column A of a worksheet.

    Reading starts at row 2 and stops at the first empty cell.

    Args:
        sheet_name: Name of the worksheet to read. (Previously this argument
            was ignored and ``'Sheet1'`` was always read.)
        filename: Path of the workbook to open.

    Returns:
        A list with the non-empty cell values of column A, in row order.

    Raises:
        KeyError: If the workbook has no worksheet called ``sheet_name``.
    """
    companys = []
    book = load_workbook(filename=filename)
    try:
        sheet = book[sheet_name]
        # Column A only, starting below the first row.
        for (company,) in sheet.iter_rows(min_row=2, max_col=1,
                                          values_only=True):
            if not company:
                break
            companys.append(company)
    finally:
        # Release the workbook even when the sheet lookup or reading fails.
        book.close()
    print(companys)
    return companys
# Write the output spreadsheet
def input_execl(datas, sheet_name, filename='济宁1.xlsx'):
    """Write the scraped columns to an Excel file.

    Args:
        datas: Mapping of column title to a list of values; it is passed
            directly to ``DataFrame``.
        sheet_name: Name of the worksheet to create. (Previously this
            argument was ignored and ``'Sheet1'`` was always used.)
        filename: Path of the workbook to write.
    """
    df = DataFrame(datas)
    df.to_excel(filename, sheet_name=sheet_name)
    print('爬取成功！！')
# Module-level accumulators, one independent list per output column.
# getItem() appends the parsed fields to these by name.
(
    company_name,  # company names
    name,          # legal representative
    no,            # taxpayer id / unified social credit code
    status,        # registration status
    type,          # enterprise type (NOTE: shadows the builtin ``type``)
    addr,          # address
    home,          # registration authority
) = ([] for _ in range(7))
def _line_after_label(text, label):
    """Return the line following ``label``'s own line, or ``None``.

    Only the text between the first and the second occurrence of ``label``
    is inspected. ``None`` is returned when ``label`` does not occur or that
    segment contains no line break.
    """
    pieces = text.split(label)
    if len(pieces) < 2:
        return None
    rows = pieces[1].split('\n')
    return rows[1] if len(rows) >= 2 else None


def getItem(item):
    """Parse one scraped info text and append its fields to the global lists.

    Exactly one value is appended to each of ``no``, ``name``, ``status``,
    ``type``, ``addr`` and ``home``; a field that cannot be located is
    recorded with a fixed placeholder string.

    Args:
        item: Text of the company info panel.
    """
    missing = '未获取到！'

    # Taxpayer id, falling back to the unified social credit code.
    code_line = _line_after_label(item, '纳税人识别号')
    if code_line is None:
        code_line = _line_after_label(item, '统一社会信用代码')
    no.append(missing if code_line is None else code_line.replace(' ', ''))

    # Legal representative: the part of the line before the '关联' marker.
    rep_line = _line_after_label(item, '法定代表人')
    if rep_line is not None and '关联' in rep_line:
        name.append(rep_line.split('关联')[0].replace(' ', ''))
    else:
        name.append(missing)

    # Registration status: cut at the first marker (in this order) that
    # appears on the line.
    state = missing
    state_line = _line_after_label(item, '登记状态')
    if state_line is not None:
        for stop in ('成立日期', '举办单位'):
            if stop in state_line:
                state = state_line.split(stop)[0].replace(' ', '')
                break
    status.append(state)

    # Enterprise type.
    kind_line = _line_after_label(item, '企业类型')
    type.append(missing if kind_line is None else kind_line.replace(' ', ''))

    # Address: text after the label, cut at either trailing marker, with
    # blanks and line breaks removed.
    if '地址' in item:
        location = item.split('地址')[1]
        for stop in ('附近企业', '宗旨和业务范围'):
            location = location.split(stop)[0]
        addr.append(location.replace(' ', '').replace('\n', ''))
    else:
        addr.append(missing)

    # Registration authority.
    office_line = _line_after_label(item, '登记机关')
    home.append(missing if office_line is None else office_line.replace(' ', ''))
# Scrape every company listed in each input sheet and export the parsed
# fields to Excel.
for i in range(1, 2):
    # Rebind the module-level accumulators so each sheet starts empty;
    # getItem() appends to these globals by name.
    company_name = []
    name = []
    no = []
    status = []
    type = []
    addr = []
    home = []
    sheet_name = 'Sheet' + str(i)
    print(sheet_name)
    companys = get_companys(sheet_name)

    for company in companys:
        try:
            messages = get_company_message(company)
        except requests.RequestException as exc:
            # One failed request must not throw away everything scraped so
            # far; record the company as "not fetched" and carry on.
            print('request failed:', exc)
            messages = None
        print(company)
        company_name.append(company)
        print(messages)
        if messages:
            print('-------------')
            getItem(messages)
        else:
            print('未获取到------')
            # Keep all columns the same length as company_name.
            for column in (no, name, status, type, addr, home):
                column.append('未获取到！')

    # Column titles of the output sheet, in output order.
    datas = {
        '企业名称': company_name,
        '纳税人识别号': no,
        '法定代表人': name,
        '登记状态': status,
        '企业类型': type,
        '企业地址': addr,
        '登记机关': home,
    }
    print(datas)
    input_execl(datas, sheet_name)